1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 package build.tools.generatebreakiteratordata;
32
33 import java.io.BufferedReader;
34 import java.io.BufferedWriter;
35 import java.io.FileReader;
36 import java.io.FileWriter;
37 import java.util.StringTokenizer;
38
/**
 * Build-time helper that reads a UnicodeData.txt-style specification file
 * (semicolon-separated records whose first three fields are the hexadecimal
 * code point, the character name and the general category) and groups the
 * code points of each category listed in {@link #categoryNames} into
 * {@code [first, last]} ranges.
 *
 * <p>Two representations are produced:
 * <ul>
 *   <li>the "new" one: per category, a flat {@code int[]} of range
 *       boundaries (first, last, first, last, ...), available through
 *       {@link #makeCategoryMap(String)} / {@link #getCategoryMap(int)};</li>
 *   <li>the "old" one: per category, Java string-literal text split into
 *       three code-point regions, written to a file only when {@code -old}
 *       is given on the command line.</li>
 * </ul>
 *
 * <p>When run through {@link #main(String[])} the tool also prints a summary
 * comparing both representations and writes {@code CharacterCategoryTest.java}.
 */
class CharacterCategory {

    /**
     * General-category abbreviations handled by this tool. The index of a
     * name in this array is the category index used by every table below.
     * The last entry, "--", is not a real category: it is the category of
     * the initial "previous character" sentinel used while scanning the
     * specification file, and it is excluded from all generated output.
     */
    static final String[] categoryNames = {
        "Ll",
        "Lu",
        "Lt",
        "Lo",
        "Lm",
        "Nd",
        "Nl",
        "No",
        "Ps",
        "Pe",
        "Pi",
        "Pf",
        "Pd",
        "Pc",
        "Po",
        "Sc",
        "Sm",
        "So",
        "Mn",
        "Mc",
        "Me",
        "Zl",
        "Zp",
        "Zs",
        "Cc",
        "Cf",
        "--",
    };

    /** First high-surrogate code unit. */
    private static final int HIGH_SURROGATE_START = 0xD800;
    /** First low-surrogate code unit. */
    private static final int LOW_SURROGATE_START = 0xDC00;
    /** First supplementary (non-BMP) code point. */
    private static final int SUPPLEMENTARY_START = 0x10000;

    /** Old-approach region: code points below the surrogate block. */
    private static final int BEFORE = 0;
    /** Old-approach region: supplementary code points (emitted as surrogate pairs). */
    private static final int SURROGATE = 1;
    /** Old-approach region: BMP code points from the surrogate block upwards. */
    private static final int AFTER = 2;

    /** New-approach counter row for code points below 0x10000. */
    private static final int BMP = 0;
    /** New-approach counter row for code points at or above 0x10000. */
    private static final int NON_BMP = 1;

    /** Initial (and minimum) capacity of each per-category boundary array. */
    private static final int INITIAL_CAPACITY = 10;

    /** Name of the generated verification program (written to the working directory). */
    private static final String TEST_PROGRAM_FILE = "CharacterCategoryTest.java";

    /** Command-line synopsis printed when the arguments cannot be understood. */
    private static final String USAGE =
        "Usage: java CharacterCategory [-spec specfile] [-old oldDatafile] [-o outputDir]";

    /** Result of {@link #makeCategoryMap(String)}: one boundary array per real category. */
    private static int[][] categoryMap;

    /** Specification file to read; overridden by {@code -spec} or {@link #makeCategoryMap(String)}. */
    private static String specfile = "UnicodeData.txt";

    /**
     * Value of the {@code -o} option.
     * NOTE(review): nothing in this class reads this value; the generated
     * files are written using the bare file names. Confirm whether it should
     * be honoured.
     */
    private static String outputDir = "";

    /** Value of the {@code -old} option; empty means "do not generate old-style data". */
    private static String oldDatafile = "";

    /* ---- State of the old approach (filled by generateOldData) ---- */

    /** Space-separated categories found in the file but absent from {@link #categoryNames}. */
    private static final StringBuilder ignoredOld = new StringBuilder();
    /** Number of boundary characters recorded per category, all regions together. */
    private static final int[] oldTotalCount = new int[categoryNames.length];
    /** Number of boundary characters recorded per region and category. */
    private static final int[][] oldListCount = new int[3][categoryNames.length];
    /** Current column of the source line being built per region and category (for wrapping). */
    private static final int[][] oldListLen = new int[3][categoryNames.length];
    /** String-literal text built per region and category. */
    private static final StringBuilder[][] oldList = new StringBuilder[3][categoryNames.length];

    /* ---- State of the new approach (filled by generateNewData) ---- */

    /** Space-separated categories found in the file but absent from {@link #categoryNames}. */
    private static final StringBuilder ignoredNew = new StringBuilder();
    /** Number of boundary values stored in {@link #newList} per category. */
    private static final int[] newTotalCount = new int[categoryNames.length];
    /** Number of boundary values per plane ({@link #BMP} / {@link #NON_BMP}) and category. */
    private static final int[][] newListCount = new int[2][categoryNames.length];
    /** Range boundaries (first, last, first, last, ...) per category; may have spare capacity. */
    private static int[][] newList = new int[categoryNames.length][];

    /**
     * Reads {@code filename} and builds the per-category range table that
     * {@link #getCategoryMap(int)} returns. May be called repeatedly; every
     * call starts from a clean state.
     *
     * @param filename path of the UnicodeData.txt-style specification file
     */
    static void makeCategoryMap(String filename) {
        specfile = filename;
        generateNewData();

        // The trailing "--" pseudo category is not part of the result.
        int realCategories = categoryNames.length - 1;
        categoryMap = new int[realCategories][];
        for (int i = 0; i < realCategories; i++) {
            int len = newListCount[BMP][i] + newListCount[NON_BMP][i];
            categoryMap[i] = new int[len];
            System.arraycopy(newList[i], 0, categoryMap[i], 0, len);
        }
    }

    /**
     * Returns the range boundaries of one category as produced by the last
     * {@link #makeCategoryMap(String)} call.
     *
     * @param category index into {@link #categoryNames} (excluding "--")
     * @return boundary values laid out as first, last, first, last, ...;
     *         this is the internal array, not a copy
     */
    static int[] getCategoryMap(int category) {
        return categoryMap[category];
    }

    /**
     * Command-line entry point: generates the new-style data, optionally the
     * old-style data file, prints the comparison summary and writes the
     * verification program.
     *
     * @param args {@code [-spec specfile] [-old oldDatafile] [-o outputDir]}
     */
    public static void main(String[] args) {
        processArgs(args);

        generateNewData();

        // Old-style data is produced only on request.
        if (!oldDatafile.isEmpty()) {
            generateOldData();
            generateOldDatafile();
        }

        // NOTE(review): the summary compares against the old-approach tables
        // even when "-old" was not given, in which case they are all empty
        // and every populated category is reported as a mismatch. Left as is;
        // confirm whether the summary should be skipped in that case.
        showSummary();

        generateTestProgram();
    }

    /**
     * Parses the command line into {@link #specfile}, {@link #oldDatafile}
     * and {@link #outputDir}. An unknown option, or an option without its
     * value, prints the usage text and terminates the JVM with status 1.
     */
    private static void processArgs(String[] args) {
        for (int i = 0; i < args.length; i++) {
            String arg = args[i];
            boolean known = arg.equals("-spec") || arg.equals("-old") || arg.equals("-o");
            // Every option takes exactly one value; reject a dangling option
            // instead of running off the end of the array.
            if (!known || i + 1 >= args.length) {
                System.err.println(USAGE);
                System.exit(1);
                return;
            }

            String value = args[++i];
            if (arg.equals("-spec")) {
                specfile = value;
            } else if (arg.equals("-old")) {
                oldDatafile = value;
            } else {
                outputDir = value;
            }
        }
    }

    /**
     * Prints, per category, the number of boundary characters recorded by the
     * old and the new approach, the resulting buffer sizes, and the list of
     * categories that were ignored. Mismatches between both approaches are
     * reported on standard error.
     */
    private static void showSummary() {
        int oldSum = 0;
        int newSum = 0;
        int oldSuppSum = 0;
        int newSuppSum = 0;

        for (int i = 0; i < categoryNames.length - 1; i++) {
            int newNum = newListCount[BMP][i] + newListCount[NON_BMP][i];

            if (oldTotalCount[i] != newNum) {
                System.err.println("Error: The number of generated data is different between the new approach and the old approach.");
            }
            if (oldListCount[SURROGATE][i] != newListCount[NON_BMP][i]) {
                System.err.println("Error: The number of generated supplementarycharacters is different between the new approach and the old approach.");
            }

            System.out.println(" " + categoryNames[i] + ": " +
                               oldTotalCount[i] +
                               "(" + oldListCount[BEFORE][i] +
                               " + " + oldListCount[SURROGATE][i] +
                               " + " + oldListCount[AFTER][i] + ")" +
                               " --- " + newNum +
                               "(" + newListCount[BMP][i] +
                               " + " + newListCount[NON_BMP][i] + ")");

            // Old approach: 2 bytes per BMP char, 4 per surrogate pair.
            // New approach: one 4-byte int per boundary value.
            oldSum += oldListCount[BEFORE][i] * 2 +
                      oldListCount[SURROGATE][i] * 4 +
                      oldListCount[AFTER][i] * 2;
            newSum += newNum * 4;
            oldSuppSum += oldListCount[SURROGATE][i] * 4;
            newSuppSum += newListCount[NON_BMP][i] * 4;
        }

        System.out.println("\nTotal buffer sizes are:\n " +
                           oldSum + "bytes(Including " + oldSuppSum +
                           "bytes for supplementary characters)\n " +
                           newSum + "bytes(Including " + newSuppSum +
                           "bytes for supplementary characters)");

        if (!ignoredOld.toString().equals(ignoredNew.toString())) {
            System.err.println("Ignored categories: Error: List mismatch: " +
                               ignoredOld + " vs. " + ignoredNew);
        } else {
            System.out.println("\nIgnored categories: " + ignoredOld);
            System.out.println("Please confirm that they aren't used in BreakIteratorRules.");
        }
    }

    /**
     * Returns the index of {@code category} in {@link #categoryNames}, or
     * {@code categoryNames.length} when the category is not handled.
     */
    private static int categoryIndex(String category) {
        int index = 0;
        while (index < categoryNames.length && !category.equals(categoryNames[index])) {
            index++;
        }
        return index;
    }

    /** Records {@code category} in {@code ignored} unless it is already listed there. */
    private static void noteIgnored(StringBuilder ignored, String category) {
        if (ignored.indexOf(category) == -1) {
            ignored.append(category).append(' ');
        }
    }

    /**
     * Rebuilds the old-style tables from {@link #specfile}. All old-approach
     * state is cleared first so that the method can be invoked repeatedly.
     * Terminates the JVM with status 1 if the "--" pseudo category received
     * anything besides the initial sentinel.
     */
    private static void generateOldData() {
        ignoredOld.setLength(0);
        for (int i = 0; i < categoryNames.length; i++) {
            oldTotalCount[i] = 0;
            for (int j = BEFORE; j <= AFTER; j++) {
                oldListCount[j][i] = 0;
                oldList[j][i] = new StringBuilder();
                oldListLen[j][i] = 17;
            }
        }

        storeOldData();

        if (oldTotalCount[categoryNames.length - 1] != 1) {
            System.err.println("This should not happen. Unicode data which belongs to an undefined category exists");
            System.exit(1);
        }
    }

    /**
     * Scans {@link #specfile} and records, per category, the first and last
     * character of every run of consecutive code points in the old-style
     * tables.
     *
     * @throws InternalError if the file cannot be read or a record is malformed
     */
    private static void storeOldData() {
        try (BufferedReader bin = new BufferedReader(new FileReader(specfile))) {
            // The scan starts with a sentinel "previous character" that
            // belongs to the "--" pseudo category.
            String prevCode = "????";
            int prevIndex = categoryNames.length - 1;
            int prevCodeValue = -1;
            boolean setFirst = false;

            String line;
            while ((line = bin.readLine()) != null) {
                if (line.length() == 0) {
                    continue;
                }

                StringTokenizer st = new StringTokenizer(line, ";");
                String code = st.nextToken();

                // Skip comment lines.
                char c = code.charAt(0);
                if (c == '#' || c == '/') {
                    continue;
                }

                int curCodeValue = Integer.parseInt(code, 16);
                String characterName = st.nextToken();
                String category = st.nextToken();

                int index = categoryIndex(category);
                if (index == categoryNames.length) {
                    noteIgnored(ignoredOld, category);
                    continue;
                }

                if (prevIndex != index) {
                    // Category changed: close the previous run, open a new one.
                    appendOldChar(prevIndex, prevCodeValue, prevCode);
                    appendOldChar(index, curCodeValue, code);
                    prevIndex = index;
                } else if (prevCodeValue != curCodeValue - 1) {
                    if (setFirst && characterName.endsWith(" Last>")) {
                        // End of a "<..., First>" / "<..., Last>" pair: the
                        // gap in code values is part of the same run.
                        setFirst = false;
                    } else {
                        appendOldChar(prevIndex, prevCodeValue, prevCode);
                        appendOldChar(index, curCodeValue, code);
                    }
                }
                prevCodeValue = curCodeValue;
                prevCode = code;
                if (characterName.endsWith(" First>")) {
                    setFirst = true;
                }
            }
            // Close the run that was still open at end of file.
            appendOldChar(prevIndex, prevCodeValue, prevCode);
        } catch (Exception e) {
            InternalError error = new InternalError(e.toString());
            error.initCause(e);
            throw error;
        }
    }

    /**
     * Appends one boundary character to the old-style string-literal text of
     * a category, escaping it as needed and wrapping the literal when the
     * current source line gets too long.
     *
     * @param index category index
     * @param code  code point value (-1 for the initial sentinel)
     * @param s     hexadecimal text of the code point as read from the file
     */
    private static void appendOldChar(int index, int code, String s) {
        int range;
        if (code < HIGH_SURROGATE_START) {
            range = BEFORE;
        } else if (code < SUPPLEMENTARY_START) {
            range = AFTER;
        } else {
            range = SURROGATE;
        }

        StringBuilder text = oldList[range][index];

        // Break the literal into a new concatenated piece past column 64.
        if (oldListLen[range][index] > 64) {
            text.append("\"\n + \"");
            oldListLen[range][index] = 19;
        }

        if (code == 0x22 || code == 0x5c) {
            // Double quote and backslash need a backslash escape.
            text.append('\\');
            text.append((char) code);
            oldListLen[range][index] += 2;
        } else if (code > 0x20 && code < 0x7F) {
            text.append((char) code);
            oldListLen[range][index]++;
        } else if (range == SURROGATE) {
            text.append(toCodeUnit(code));
            oldListLen[range][index] += 12;
        } else {
            text.append("\\u");
            text.append(s);
            oldListLen[range][index] += 6;
        }
        oldListCount[range][index]++;
        oldTotalCount[index]++;
    }

    /**
     * Formats a supplementary code point as the source text of its surrogate
     * pair, i.e. two backslash-u escapes with upper-case hexadecimal digits.
     */
    private static String toCodeUnit(int i) {
        int high = (i - SUPPLEMENTARY_START) / 0x400 + HIGH_SURROGATE_START;
        int low = i % 0x400 + LOW_SURROGATE_START;
        StringBuilder sb = new StringBuilder();
        sb.append("\\u");
        sb.append(Integer.toString(high, 16).toUpperCase());
        sb.append("\\u");
        sb.append(Integer.toString(low, 16).toUpperCase());
        return sb.toString();
    }

    /**
     * Converts a string holding one character or one surrogate pair to its
     * code point. Not called from within this class at present.
     *
     * @return the code point, or -1 if {@code s} starts with a high surrogate
     *         that is not followed by exactly one low surrogate
     */
    private static int toCodePoint(String s) {
        char c1 = s.charAt(0);

        if (s.length() == 1 || !Character.isHighSurrogate(c1)) {
            return (int) c1;
        }

        char c2 = s.charAt(1);
        if (s.length() != 2 || !Character.isLowSurrogate(c2)) {
            return -1;
        }
        return Character.toCodePoint(c1, c2);
    }

    /**
     * Rebuilds the new-style tables from {@link #specfile}. All new-approach
     * state (boundary arrays, counters and the ignored-category list) is
     * cleared first so that the method can be invoked repeatedly, e.g. through
     * several {@link #makeCategoryMap(String)} calls. Terminates the JVM with
     * status 1 if the "--" pseudo category received anything besides the
     * initial sentinel.
     */
    private static void generateNewData() {
        ignoredNew.setLength(0);
        for (int i = 0; i < categoryNames.length; i++) {
            newList[i] = new int[INITIAL_CAPACITY];
            newTotalCount[i] = 0;
            newListCount[BMP][i] = 0;
            newListCount[NON_BMP][i] = 0;
        }

        storeNewData();

        if (newListCount[BMP][categoryNames.length - 1] != 1) {
            System.err.println("This should not happen. Unicode data which belongs to an undefined category exists");
            System.exit(1);
        }
    }

    /**
     * Scans {@link #specfile} and records, per category, the first and last
     * code point of every run of consecutive code points in the new-style
     * tables. Inconsistent "First>" / "Last>" records are reported on
     * standard error. Any exception is reported and terminates the JVM with
     * status 1.
     */
    private static void storeNewData() {
        try (BufferedReader bin = new BufferedReader(new FileReader(specfile))) {
            // The scan starts with a sentinel "previous character" that
            // belongs to the "--" pseudo category.
            int prevIndex = categoryNames.length - 1;
            int prevCodeValue = -1;
            boolean setFirst = false;

            String line;
            while ((line = bin.readLine()) != null) {
                if (line.length() == 0) {
                    continue;
                }

                StringTokenizer st = new StringTokenizer(line, ";");
                String code = st.nextToken();

                // Skip comment lines.
                char c = code.charAt(0);
                if (c == '#' || c == '/') {
                    continue;
                }

                int curCodeValue = Integer.parseInt(code, 16);
                String characterName = st.nextToken();
                String category = st.nextToken();

                int index = categoryIndex(category);
                if (index == categoryNames.length) {
                    noteIgnored(ignoredNew, category);
                    continue;
                }

                boolean isFirst = characterName.endsWith(" First>");
                boolean isLast = characterName.endsWith(" Last>");

                if (prevIndex == index) {
                    if (setFirst) {
                        // Inside a First/Last pair only the matching "Last>"
                        // record is expected; it extends the open run.
                        if (isLast) {
                            setFirst = false;
                        } else {
                            System.err.println("*** Error 1 at " + code);
                        }
                    } else if (isLast) {
                        System.err.println("*** Error 2 at " + code);
                    } else {
                        if (isFirst) {
                            setFirst = true;
                        }
                        // A gap ends the previous run and starts a new one.
                        // This must also happen for a "First>" record, or two
                        // separate ranges of the same category would be
                        // merged across the unassigned gap between them.
                        if (prevCodeValue != curCodeValue - 1) {
                            appendNewChar(prevIndex, prevCodeValue);
                            appendNewChar(index, curCodeValue);
                        }
                    }
                } else {
                    if (setFirst) {
                        System.err.println("*** Error 3 at " + code);
                    } else if (isFirst) {
                        setFirst = true;
                    } else if (isLast) {
                        System.err.println("*** Error 4 at " + code);
                    }
                    // Category changed: close the previous run, open a new one.
                    appendNewChar(prevIndex, prevCodeValue);
                    appendNewChar(index, curCodeValue);
                    prevIndex = index;
                }
                prevCodeValue = curCodeValue;
            }
            // Close the run that was still open at end of file.
            appendNewChar(prevIndex, prevCodeValue);
        } catch (Exception e) {
            System.err.println("Error occurred on accessing " + specfile);
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Appends one boundary value to the new-style array of a category,
     * growing the array when it is full, and updates the per-plane counters.
     *
     * @param index category index
     * @param code  code point value (-1 for the initial sentinel)
     */
    private static void appendNewChar(int index, int code) {
        int capacity = newList[index].length;
        if (newTotalCount[index] == capacity) {
            // Double the capacity so that repeated appends stay cheap.
            int[] grown = new int[Math.max(INITIAL_CAPACITY, capacity * 2)];
            System.arraycopy(newList[index], 0, grown, 0, capacity);
            newList[index] = grown;
        }

        newList[index][newTotalCount[index]++] = code;
        if (code < SUPPLEMENTARY_START) {
            newListCount[BMP][index]++;
        } else {
            newListCount[NON_BMP][index]++;
        }
    }

    /**
     * Writes the old-style tables to {@link #oldDatafile} as the source text
     * of a {@code String[][]} initializer. Categories without any data are
     * omitted. Any exception is reported and terminates the JVM with status 1.
     */
    private static void generateOldDatafile() {
        // Order in which the three regions are concatenated in the output.
        int[] regions = { BEFORE, AFTER, SURROGATE };

        try (BufferedWriter bout = new BufferedWriter(new FileWriter(oldDatafile))) {
            bout.write("\n //\n // The following String[][] can be used in CharSet.java as is.\n //\n\n private static final String[][] categoryMap = {\n");
            for (int i = 0; i < categoryNames.length - 1; i++) {
                if (oldTotalCount[i] == 0) {
                    continue;
                }
                bout.write(" { \"" + categoryNames[i] + "\",");

                // The first non-empty region opens the literal; every later
                // one is appended with a string concatenation operator.
                boolean wroteRegion = false;
                for (int region : regions) {
                    if (oldListCount[region][i] == 0) {
                        continue;
                    }
                    bout.write(wroteRegion ? " + \"" : " \"");
                    bout.write(oldList[region][i].toString() + "\"\n");
                    wroteRegion = true;
                }
                bout.write(" },\n");
            }
            bout.write(" };\n\n");
        } catch (Exception e) {
            System.err.println("Error occurred on accessing " + oldDatafile);
            e.printStackTrace();
            System.exit(1);
        }

        System.out.println("\n" + oldDatafile + " has been generated.");
    }

    /**
     * Writes {@code CharacterCategoryTest.java}: the fixed program text in
     * {@link #collationMethod} followed by the category names and the
     * new-style range table as array initializers. Any exception is reported
     * and terminates the JVM with status 1.
     */
    private static void generateTestProgram() {
        try (BufferedWriter bout = new BufferedWriter(new FileWriter(TEST_PROGRAM_FILE))) {
            bout.write(collationMethod);
            bout.write("\n //\n // The following arrays can be used in CharSet.java as is.\n //\n\n");

            bout.write(" private static final String[] categoryNames = {");
            for (int i = 0; i < categoryNames.length - 1; i++) {
                // Ten names per line.
                if (i % 10 == 0) {
                    bout.write("\n ");
                }
                bout.write("\"" + categoryNames[i] + "\", ");
            }
            bout.write("\n };\n\n");

            bout.write(" private static final int[][] categoryMap = {\n");

            for (int i = 0; i < categoryNames.length - 1; i++) {
                StringBuilder sb = new StringBuilder(" { /* Data for \"" + categoryNames[i] + "\" category */");

                for (int j = 0; j < newTotalCount[i]; j++) {
                    // Eight values per line.
                    if (j % 8 == 0) {
                        sb.append("\n ");
                    }
                    sb.append(" 0x");
                    sb.append(Integer.toString(newList[i][j], 16).toUpperCase());
                    sb.append(',');
                }
                sb.append("\n },\n");
                bout.write(sb.toString());
            }

            bout.write(" };\n");

            bout.write("\n}\n");
        } catch (Exception e) {
            System.err.println("Error occurred on accessing " + TEST_PROGRAM_FILE);
            e.printStackTrace();
            System.exit(1);
        }

        System.out.println("\n" + TEST_PROGRAM_FILE + " has been generated.");
    }

    /**
     * Fixed leading part of the generated verification program. The program
     * expands the generated range table into a per-code-point category array
     * and compares it with {@code Character.getType()}; the generated array
     * initializers and the closing brace of the class are appended by
     * {@link #generateTestProgram()}.
     */
    static String collationMethod =
        "public class CharacterCategoryTest {\n\n" +
        " static final int SIZE = 0x110000;\n" +
        " static final String[] category = {\n" +
        " \"Cn\", \"Lu\", \"Ll\", \"Lt\", \"Lm\", \"Lo\", \"Mn\", \"Me\",\n" +
        " \"Mc\", \"Nd\", \"Nl\", \"No\", \"Zs\", \"Zl\", \"Zp\", \"Cc\",\n" +
        " \"Cf\", \"\", \"Co\", \"Cs\", \"Pd\", \"Ps\", \"Pe\", \"Pc\",\n" +
        " \"Po\", \"Sm\", \"Sc\", \"Sk\", \"So\", \"Pi\", \"Pf\"\n" +
        " };\n\n" +
        " public static void main(String[] args) {\n" +
        " boolean err = false;\n" +
        " byte[] b = new byte[SIZE];\n" +
        " for (int i = 0; i < SIZE; i++) {\n" +
        " b[i] = 0;\n" +
        " }\n" +
        " for (int i = 0; i < categoryMap.length; i++) {\n" +
        " byte categoryNum = 0;\n" +
        " String categoryName = categoryNames[i];\n" +
        " for (int j = 0; j < category.length; j++) {\n" +
        " if (categoryName.equals(category[j])) {\n" +
        " categoryNum = (byte)j;\n" +
        " break;\n" +
        " }\n" +
        " }\n" +
        " int[] values = categoryMap[i];\n" +
        " for (int j = 0; j < values.length;) {\n" +
        " int firstChar = values[j++];\n" +
        " int lastChar = values[j++];\n" +
        " for (int k = firstChar; k <= lastChar; k++) {\n" +
        " b[k] = categoryNum;\n" +
        " }\n" +
        " }\n" +
        " }\n" +
        " for (int i = 0; i < SIZE; i++) {\n" +
        " int characterType = Character.getType(i);\n" +
        " if (b[i] != characterType) {\n" +
        " /* Co, Cs and Sk categories are ignored in CharacterCategory. */\n" +
        " if (characterType == Character.PRIVATE_USE ||\n" +
        " characterType == Character.SURROGATE ||\n" +
        " characterType == Character.MODIFIER_SYMBOL) {\n" +
        " continue;\n" +
        " }\n" +
        " err = true;\n" +
        " System.err.println(\"Category conflict for a character(0x\" +\n" +
        " Integer.toHexString(i) +\n" +
        " \"). CharSet.categoryMap:\" +\n" +
        " category[b[i]] +\n" +
        " \" Character.getType():\" +\n" +
        " category[characterType]);\n" +
        " }\n" +
        " }\n\n" +
        " if (err) {\n" +
        " throw new RuntimeException(\"Conflict occurred between Charset.categoryMap and Character.getType()\");\n" +
        " }\n" +
        " }\n";

}